"""Load datasets used by figures and models."""

import hashlib
import os
import random

import pandas as pd

# Root data directory, resolved relative to this module: <module dir>/../data
DATA_PATH = os.path.join(os.path.dirname(__file__), '..', 'data')
# Sub-directories holding the economic series and the novel bibliographies
ECONOMY_DATA_PATH = os.path.join(DATA_PATH, 'economy')
NOVELS_DATA_PATH = os.path.join(DATA_PATH, 'novels')

# Novel-related data files
MORETTI_TITLES_FILENAME = os.path.join(NOVELS_DATA_PATH, 'TITLES_1600-1850_moretti.xls')
NOVELS_PROJECT_FILENAME = os.path.join(NOVELS_DATA_PATH, 'novels-project-works.json')
# Expected SHA-256 hex digest of NOVELS_PROJECT_FILENAME (checked in garside_schöwerling)
NOVELS_PROJECT_FILENAME_SHA256 = '48671e8ec3bae84071415ec6b29dddf64af86dc12f8aaab3da29b1660ca62053'
ELIOT_FILENAME = os.path.join(NOVELS_DATA_PATH, 'eliot-1997-nstc-publishers-circular-v1.csv')
# Expected SHA-256 hex digest of ELIOT_FILENAME (checked in publishers_circular)
ELIOT_FILENAME_SHA256 = '3d177ef33f2430e98c9a3d5cca7efdce3f50d5d889241154835f2649f33a8ecc'
NSTC_FILENAME = os.path.join(NOVELS_DATA_PATH, 'nstc-1801-1870-loced-eliot-1997.csv')
CASEY_FILENAME = os.path.join(NOVELS_DATA_PATH, 'casey-1996-athenaeum-novels.csv')
BASSETT_AT_THE_CIRCULATING_LIBRARY_PRIORS_FILENAME = os.path.join(NOVELS_DATA_PATH,
                                                                  'bassett-atcl-riddell-elicited-prior.csv')
ANDREW_BLOCK_FILENAME = os.path.join(NOVELS_DATA_PATH, 'andrew-block', 'andrew-block-grep-1815-1850.csv')

# Economy-related data files
LITERACY_FILENAME = os.path.join(ECONOMY_DATA_PATH, 'average-literacy-england-clark-figure7.csv')
POPULATION_FILENAME = os.path.join(ECONOMY_DATA_PATH, 'population-by-country.csv')
MADDISON_FILENAME = os.path.join(ECONOMY_DATA_PATH, 'vertical-file_02-2010.xls')
ALLEN_FILENAME = os.path.join(ECONOMY_DATA_PATH, 'historical_real_wage_allen_fig1.csv')


def _historical_real_wages():
    """Load the historical real wage data taken from Allen (2009).

    The values were crudely read off Figure 1 in Allen (2009), so they
    should be treated as approximate.

    Returns:

        pandas.DataFrame: Contents of ``ALLEN_FILENAME``, indexed by the
        file's first column.

    """
    real_wages = pd.read_csv(ALLEN_FILENAME, index_col=0)
    return real_wages


def _literacy_rates():
    """Load literacy rates for England taken from Clark (2005).

    NOTE: the rates were crudely read off Clark's Figure 7, so they are
    approximate.

    Clark, Gregory. “The Condition of the Working Class in England, 1209–2004.”
    Journal of Political Economy 113, no. 6 (December 1, 2005): 1307–40.
    doi:10.1086/498123.

    Returns:

        pandas.DataFrame: Contents of ``LITERACY_FILENAME`` indexed by the
        file's first column, with the data columns relabelled
        'men_england' and 'women_england'.

    """
    column_names = ['men_england', 'women_england']
    rates = pd.read_csv(LITERACY_FILENAME, index_col=0)
    # relabel the data columns (the file must hold exactly two of them)
    rates.columns = column_names
    return rates


def _population_british_isles():
    """Population of UK plus Ireland from Maddison, 1820-1919.

    Note: More data like this is available from the Clio-Infra project.

    Returns:

        pandas.Series: Sum of the 'UK' and 'Ireland ' columns by year
        (1820-1919), named 'british_isles_population'.

    Raises:

        AssertionError: If any value in the selected range is missing.

    """
    # `sheet_name` is the current keyword; the older `sheetname` spelling was
    # deprecated by pandas and later removed.
    maddison = pd.read_excel(MADDISON_FILENAME, sheet_name='Population', skiprows=2, index_col=0)
    maddison = maddison.loc[1820:1919, ['UK', 'Ireland ']]  # note space in Ireland column name
    maddison.index = maddison.index.astype(int)
    maddison.index.name = 'year'
    assert maddison.notnull().all().all()
    # NOTE(review): presumably the sheet records population in thousands -- confirm
    maddison = 1000 * maddison
    british_isles = maddison.sum(axis=1)
    british_isles.name = 'british_isles_population'
    return british_isles


def _moretti_title_counts():
    """Counts from Moretti's Style, Inc. dataset.

    This is the original dataset used for Franco Moretti's "Style, Inc.
    Reflections on Seven Thousand Titles (British Novels, 1740 –1850)".

    NOTE: DO NOT USE THIS DATASET!!! IT CONTAINS COUNTS FROM A
    MIXTURE OF SOURCES. THESE SOURCES USE RADICALLY DIFFERENT DEFINITIONS OF
    THE NOVEL.

    Data on titles published before 1770 are not used. The remaining data are
    from the following three sources:

    - 1770–1829: Peter Garside et al., The English Novel, 1770 –1829: A
      Bibliographical Survey of Prose Fiction Published in the British Isles, 2
      vols. (Oxford, 2000)

    - 1830–36: “The British Novel 1830–36: A Bibliographical Survey of Fiction
      Published in the British Isles,” ed. Garside et al.,
      www.cardiff.ac.uk/encap/journals/corvey/1830s/index.html

    - 1837–50: Andrew Block, The English Novel, 1740 –1850: A Catalogue
      Including Prose Romances, Short Stories, and Translations of Foreign
      Fiction, 2d ed. (London, 1961)

    Block uses a far more inclusive definition of the novel than the other
    sources.  Moretti notes this: "The chart stops in 1836 because it seems
    very likely that Andrew Block’s bibliography significantly overstates the
    number of novels published after that date." (p. 140)

    Returns:

        pandas.Series: Number of titles published by year (1770 onwards),
        indexed by 'year' and named 'moretti_titles_count'.
    """
    # first sheet of the workbook; one row per title
    titles = pd.read_excel(MORETTI_TITLES_FILENAME, 0)
    moretti_titles = titles[['Year']].groupby('Year').size()
    moretti_titles.index.name = 'year'
    # groupby(...).size() yields a Series, which has no columns; the label
    # therefore has to be set through `name` (assigning `.columns` is a no-op).
    moretti_titles.name = 'moretti_titles_count'
    return moretti_titles.sort_index().loc[1770:]


def garside_schöwerling():
    """Load and return the Garside and Schöwerling dataset.

    The file's SHA-256 digest is verified before loading. Only records with
    a year between 1800 and 1836 (inclusive) are kept, a handful of records
    are patched by hand, and missing 'publication' values are filled in from
    'publisher'.

    Returns:

        pandas.DataFrame: One row per record, indexed by integer id.

    Raises:

        AssertionError: If the checksum does not match or a 'publication'
        value is still missing after the fixes.
    """
    # verify the checksum so that we know we are reading the expected file
    with open(NOVELS_PROJECT_FILENAME, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    assert digest == NOVELS_PROJECT_FILENAME_SHA256, (digest, NOVELS_PROJECT_FILENAME_SHA256)

    # read the records and keep only the years for which the data is exhaustive
    works = pd.read_json(NOVELS_PROJECT_FILENAME, orient='index')
    works.index = works.index.values.astype(int)
    exhaustive_years = (works['year'] >= 1800) & (works['year'] <= 1836)
    works = works[exhaustive_years]

    # gender annotations (from British Fiction database) added by hand for
    # these authors in updates
    manual_genders = {
        2899: 'Male',
        2900: 'Female',
        2901: 'Unknown',
        2902: 'Unknown',
    }
    for work_id, gender in manual_genders.items():
        works.loc[work_id, 'gender'] = gender

    # the same records have publication and publisher swapped
    for work_id in manual_genders:
        works.loc[work_id, 'publication'] = works.loc[work_id, 'publisher']

    # fall back to 'publisher' wherever 'publication' is missing
    missing_publication = works['publication'].isnull()
    works.loc[missing_publication, 'publication'] = works.loc[missing_publication, 'publisher']

    # sanity check: every record now has a publication
    assert works['publication'].notnull().all()
    return works


def garside_schöwerling_title_counts():
    """Yearly counts of new titles from the Garside and Schöwerling bibliography.

    The counts come from the British Fiction online database. They differ
    slightly (by less than 1%) from the counts in Moretti's dataset; the
    size of that discrepancy is asserted here as a cross-check.

    Returns:

        pandas.Series: Number of titles published by year (1800-1836),
        named 'novels'.
    """
    works = garside_schöwerling()
    # restrict to 1800-1836 inclusive
    counts = works.groupby('year').size().sort_index().loc[1800:1836]

    # cross-check against Moretti's counts; they differ from these records,
    # but only by a small amount
    moretti_counts = _moretti_title_counts().loc[1800:1836]
    difference = counts - moretti_counts
    assert difference.sum() < 20
    assert difference.sum() / counts.sum() < 0.01
    assert difference.abs().sum() < 30

    counts.name = 'novels'
    return counts


def garside_schöwerling_title_counts_by_gender():
    """Counts of new titles by author gender from the Garside and Schöwerling bibliography.

    These counts were retrieved from the British Fiction online database and
    differ slightly from the counts Moretti has. The difference is less than 1%.

    Author gender is only recorded for titles published between 1800 and 1829 (inclusive).

    Returns:

        pandas.DataFrame: Number of titles published by year (rows,
        1800-1829) and author gender (columns, named
        ``novels_<gender in lower case>``).

    Raises:

        AssertionError: If the result does not cover exactly 30 years.
    """
    df = garside_schöwerling()
    # Limit to 1800-1829 inclusive.
    # The grouping keys must be given as a list: newer pandas versions treat a
    # tuple as one single key rather than as a sequence of keys.
    counts = df.groupby(['year', 'gender']).size().sort_index().loc[1800:1829].unstack()
    assert counts.shape[0] == 30
    counts.columns = [f'novels_{gender.lower()}' for gender in counts.columns]
    return counts


def publishers_circular():
    """Yearly edition counts from Publishers' Circular.

    These counts are from Eliot (1994) and are discussed in Weedon (2003), p. 45-47.

    Weedon notes that "The Publishers' Circular's methods of data collection
    were being established during 1840-1842 and therefore the figures for these
    years are tentative."

    For that reason only the years 1843-1919 are returned.

    Returns:

        pandas.Series: Number of Publishers' Circular editions by year,
        as integers.

    Raises:

        AssertionError: If the file checksum does not match or the series
        is not a complete, unique run of years from 1843 to 1919.
    """
    first_year, last_year = 1843, 1919

    # verify the checksum so that we know we are reading the expected file
    with open(ELIOT_FILENAME, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    assert digest == ELIOT_FILENAME_SHA256, (digest, ELIOT_FILENAME_SHA256)

    eliot = pd.read_csv(ELIOT_FILENAME, index_col=0).sort_index()
    editions = eliot.loc[first_year:, 'publishers_circular_editions'].sort_index()

    # the series must be complete: no gaps, no duplicates, expected bounds
    assert editions.notnull().all(), editions.loc[editions.isnull()]
    assert editions.index.max() == last_year
    assert editions.index.min() == first_year
    assert editions.index.is_unique
    assert len(editions) == last_year - first_year + 1
    return editions.astype(int)


def nineteenth_century_short_title_catalogue_loced():
    """Counts from the Nineteenth Century Short Title Catalogue (NSTC).

    Only titles with an imprint location of London, Oxford, Cambridge,
    Edinburgh, or Dublin are counted.

    These counts are from Eliot (1997) and are discussed in Weedon (2003), p.  45-47.
    Covers 1801-1870 inclusive.

    Returns:

        pandas.Series: NSTC counts from Eliot (1997), sorted by index.

    Raises:

        AssertionError: If any count is missing.

    """
    column_name = 'nstc_imprint_location_london_oxford_cambridge_edinburgh_dublin'
    table = pd.read_csv(NSTC_FILENAME, index_col=0).sort_index()
    nstc = table[column_name]
    # every entry must have a count; report the offending rows otherwise
    assert not nstc.isnull().any(), nstc.loc[nstc.isnull()]
    return nstc


def raven_forster_1789_1799():
    """Yearly counts of new titles from the Raven and Forster bibliography.

    Returns:

        pandas.Series: Number of titles published by year (1789-1799),
        named 'novels'.

    """
    first_year, last_year = 1789, 1799
    # Keyed in by hand; the figures appear in Table 8 in Raven and Forster.
    titles_per_year = (71, 74, 74, 58, 45, 56, 50, 91, 79, 75, 99)
    return pd.Series(data=titles_per_year, index=range(first_year, last_year + 1), name='novels')


def raven_forster_1789_1799_random_sample_indices(size=10):
    """Randomly sample `size` title indices from each year in Raven and Forster.

    The author gender of these titles will be manually coded.

    Indices are 1-based and drawn independently, so the same index can
    occur more than once within a year. The global `random` generator is
    re-seeded with 1 on every call, which makes the result reproducible
    (and resets the generator state for the caller as a side effect).

    Args:

        size (int): Number of draws per year. Defaults to 10.

    Returns:

        dict: Keys are years, values are sorted tuples of draws.

    """
    series = raven_forster_1789_1799()
    random.seed(1)
    # honour `size` (it used to be ignored in favour of a hard-coded 10)
    return {
        year: tuple(sorted(random.randrange(count) + 1 for _ in range(size)))
        for year, count in series.items()
    }


def raven_forster_1789_1799_random_sample_by_gender():
    """Author gender tallies for the random sample of 10 titles per year.

    The sample was drawn from Raven and Forster and the author gender of
    each sampled title has been manually coded.

    Returns:

        pandas.DataFrame: Indexed by year (1789-1799) with the columns
        'novels_men', 'novels_women' and 'novels_unknown'; every row sums
        to 10.

    """
    sample_size = 10
    men = (1, 1, 3, 4, 3, 5, 5, 4, 0, 2, 2)
    women = (5, 4, 3, 6, 5, 3, 5, 5, 6, 7, 8)
    unknown = (4, 5, 4, 0, 2, 2, 0, 1, 4, 1, 0)
    tallies = {'novels_men': men, 'novels_women': women, 'novels_unknown': unknown}

    # one tally per year for each gender
    for values in tallies.values():
        assert len(values) == 11

    sample = pd.DataFrame(tallies, index=range(1789, 1799 + 1))
    # the tallies of each year must add up to the sample size
    assert (sample.sum(axis=1) == sample_size).all()
    return sample


def andrew_block():
    """Yearly counts of titles listed in Andrew Block's bibliography.

    NOTE: DO NOT USE THIS DATA! Andrew Block includes many titles in his
    bibliography which are not novels by any prevailing definition!

    The extraction method is crude: the "count" for a given year is simply
    the number of OCR'd text lines which contain that year.

    Returns:
        Series: title counts for Block 1815-1850

    Raises:
        AssertionError: If the series does not hold exactly one entry per
        year for 1815-1850.
    """
    table = pd.read_csv(ANDREW_BLOCK_FILENAME, index_col=0)
    block_counts = table['andrew_block_counts'].sort_index()
    expected_years = 1850 - 1815 + 1
    assert len(block_counts) == expected_years
    assert block_counts.index.is_unique
    return block_counts


def andrew_block_discount():
    """Rough factor for converting Block's title counts to the Garside and Schöwerling rate.

    The estimate rests on a very small sample: the 108 titles in Block with
    a publication year of 1836. Of these, 77 also appear in Garside and
    Schöwerling (GS), while 31 (mostly juvenile fiction and religious texts)
    appear in Block only. GS lists 78 titles published in 1836, which
    implies that Block likely missed just one (or listed it under an
    incorrect publication year).

    Taking GS as the reference, Block is inflated by roughly 108/78, i.e.
    approximately 38%. Multiplying a Block count by 78 / 108 therefore
    recovers the GS rate.

    Returns:
        float: The ratio 78 / 108.

    """
    gs_titles_1836 = 78
    block_titles_1836 = 108
    return gs_titles_1836 / block_titles_1836


def casey_athenaeum_novels():
    """Counts of novels reviewed in the *Athenaeum*.

    Source: Casey (1996). "Edging Women out?: Reviews of Women Novelists in the
    "Athenaeum," 1860-1900"

    Returns:
        pandas.Series: Counts of novels reviewed, named
        'athenaeum_novels_reviewed'.

    """
    series_name = 'athenaeum_novels_reviewed'
    table = pd.read_csv(CASEY_FILENAME, index_col=0)
    # expose the total under a source-specific name
    table = table.rename(columns={'total_novels_reviewed': series_name})
    return table[series_name]


def casey_athenaeum_novels_by_gender():
    """Counts of novels reviewed in the *Athenaeum*, split by author gender.

    Source: Casey (1996). "Edging Women out?: Reviews of Women Novelists in the
    "Athenaeum," 1860-1900"

    The count for authors of unknown gender is derived as the total minus
    the male and female counts.

    Returns:
        pandas.DataFrame: Counts of novels reviewed by gender.

    """
    reviews = pd.read_csv(CASEY_FILENAME, index_col=0)

    total = reviews['total_novels_reviewed']
    men = reviews['male_authors_floor']
    women = reviews['female_authors_floor']
    # whatever is not attributed to men or women counts as unknown
    reviews['unknown_authors'] = total - men - women

    new_names = {
        'total_novels_reviewed': 'athenaeum_novels_reviewed',
        'male_authors_floor': 'athenaeum_novels_reviewed_men',
        'female_authors_floor': 'athenaeum_novels_reviewed_women',
        'unknown_authors': 'athenaeum_novels_reviewed_unknown_gender',
    }
    return reviews.rename(columns=new_names)


def bassett_at_the_circulating_library_priors_discounted():
    """Return Bassett's elicited priors scaled down towards Garside, Raven, and Schöwerling's definition.

    Bassett reports that his definition of a novel includes more novels
    than Garside, Raven, and Schöwerling's definition. The priors returned
    by `bassett_at_the_circulating_library_priors` are therefore multiplied
    by ``1 - 0.125``.

    NOTE(review): the accompanying description of Bassett's estimate cites
    a figure of 10%, whereas the discount applied here is 12.5% -- confirm
    which value is intended.

    Returns:
        pandas.DataFrame: The priors with every value multiplied by 0.875.

    """
    discount = 0.125
    priors = bassett_at_the_circulating_library_priors()
    return priors * (1 - discount)


def bassett_at_the_circulating_library_priors():
    """Bassett's elicited priors for novels published in 1886, 1891, and 1894.

    NOTE: These are Bassett's priors for the total number of novels which will
    eventually be listed on *At the Circulating Library*. Bassett's definition
    of "novel" is similar to, but distinct from, the one used by Garside and
    Schöwerling. Bassett estimates that the Garside and Schöwerling definition
    would exclude 10% of the novels in *At the Circulating Library*.

    See the function `bassett_at_the_circulating_library_priors_discounted`.

    Returns:
        pandas.DataFrame: Contents of the priors file, indexed by its first
        column, with the percentile columns renamed to
        'bassett_25_percentile', 'bassett_50_percentile' and
        'bassett_75_percentile'. Any other columns are passed through
        unchanged.

    """
    priors = pd.read_csv(BASSETT_AT_THE_CIRCULATING_LIBRARY_PRIORS_FILENAME, index_col=0)
    percentile_columns = ('25_percentile', '50_percentile', '75_percentile')
    # prefix the percentile columns with their source
    renames = {column: f'bassett_{column}' for column in percentile_columns}
    return priors.rename(columns=renames)


if __name__ == '__main__':
    # Smoke test: call the loaders once so that their built-in assertions
    # (checksums, year ranges, missing values) are exercised.
    print('running superficial checks')
    assert _moretti_title_counts() is not None
    assert garside_schöwerling() is not None
    assert garside_schöwerling_title_counts() is not None
    assert publishers_circular() is not None
    assert nineteenth_century_short_title_catalogue_loced() is not None
    # the loader is named after the years it covers, 1789-1799
    assert raven_forster_1789_1799() is not None
    assert andrew_block() is not None
    assert andrew_block_discount() is not None
    assert casey_athenaeum_novels() is not None
    assert bassett_at_the_circulating_library_priors_discounted() is not None
    assert bassett_at_the_circulating_library_priors() is not None
    print('done running superficial checks')
